# Package setup: install pacman on first use, then let it install (if needed)
# and attach everything this script relies on.
# use this line for installing/loading
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# cowplot supplies plot_grid(); openintro supplies the `county` data set.
# p_load() only installs a package when it is missing, so openintro is no
# longer re-installed on every run.
pacman::p_load(tidyverse, palmerpenguins, here, cowplot, openintro)
package 'openintro' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\matto\AppData\Local\Temp\RtmpcV01DQ\downloaded_packages
# Read the accidents data from data/accidents.csv.
accidents_data <- read_csv(here("data", "accidents.csv"))

# glimpse(accidents_data) # Quick glimpse of the data

# Converts day of week (Monday, Friday, etc.) to a weekend or weekday label.
accidents_data <- accidents_data |>
  mutate(
    weekday_or_weekend = ifelse(
      day_of_week %in% c("Saturday", "Sunday"),
      "Weekend",
      "Weekday"
    )
  )

# Density plot of accident time, filled by severity, one panel per
# weekday/weekend group.
ggplot(accidents_data, aes(x = time, fill = severity)) +
  geom_density(alpha = 0.5, adjust = 1) +
  facet_wrap(~weekday_or_weekend, nrow = 2) +
  scale_fill_manual(
    values = c(
      "Fatal" = "#9370DB",
      "Serious" = "#87CEEB",
      "Slight" = "#FFFFE0"
    )
  ) +
  theme_minimal() +
  labs(
    x = "Time of Day",
    y = "Density",
    title = "Number of Accidents throughout the day\nBy day of week and severity"
  )
2 - NYC marathon winners
library("cowplot") # I used this library for plot_grid() to stack the plots.

nyc_marathon_data <- read_csv(here("data", "nyc_marathon.csv"))

# Quick glimpse of the data
# glimpse(nyc_marathon_data)

# Code for the histogram
histogram <- ggplot(nyc_marathon_data, aes(x = time)) +
  geom_histogram(binwidth = 75) +
  labs(x = "Marathon Completion Time")

# Code for the boxplot, removing the extra axis info so it looks cleaner
# stacked with the histogram
boxplot <- ggplot(nyc_marathon_data, aes(x = time)) +
  geom_boxplot() +
  labs(title = "A. Boxplot and Histogram") +
  theme_minimal(base_size = 14) +
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.y = element_blank()
  )

# This function stacks the boxplot and histogram on top of each other
plot_grid(
  boxplot,
  histogram,
  align = "v",
  axis = "l",
  ncol = 1,
  heights = c(1, 3)
)
# Answer to Part A: The histogram gives a better visual of the distribution of
# runners that completed the marathon. We can see a clearer visual
# representation of the completion times. The boxplot, however, gives us a
# clearer indication of the median points, outliers, and quartiles. The boxplot
# lacks the visualization of the distribution, while the histogram lacks the
# statistical information such as median, outliers, and quartiles.

# Part B Code
ggplot(nyc_marathon_data, aes(x = time_hrs, y = division, fill = division)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Men" = "Blue", "Women" = "Red")) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Men vs Women Marathon Times",
    x = "Marathon Time by Hour",
    y = "Division by Gender"
  )
# Answer to Part B: Compared to women, men have a narrower range of marathon
# times. Men also have a smaller quartile range, and on average, a faster
# completion time. Women have more outlier points.

# Part C Code
# The fill is a constant gray, so no fill mapping (and no legend) is needed.
ggplot(nyc_marathon_data, aes(x = time_hrs, y = division)) +
  geom_boxplot(show.legend = FALSE, fill = "gray") +
  theme_minimal(base_size = 14) +
  # theme() tweaks must come AFTER the complete theme; theme_minimal() replaces
  # every theme element, so placing it last would bring the y-axis title back.
  theme(axis.title.y = element_blank()) +
  labs(
    title = "Men vs Women Marathon Times",
    x = "Marathon Time by Hour"
  )
# Answer to Part C: The separation by Gender already speaks for itself a bit. I
# removed the legend and y-axis label. I also decided to remove the color and
# just use a default gray fill, although I suppose I could have kept the blue
# vs red/pink in. Those are sort of historical default "gender" color
# separators. My changes affect the data-to-ink ratio by cutting down a lot of
# excess colors and information in order to simplify the visual.

# Part D Code
ggplot(
  nyc_marathon_data,
  aes(x = year, y = time_hrs, color = division, shape = division)
) +
  geom_point() +
  geom_line(aes(group = division)) + # necessary to connect the points
  scale_color_manual(values = c("Men" = "Blue", "Women" = "Red")) +
  labs(
    title = "Marathon Times by Year",
    x = "Year",
    y = "Marathon Time",
    color = "Division",
    shape = "Division"
  ) +
  theme_minimal(base_size = 14)
#Answer to Part D: What is more visible in this plot compared to the others is the trend. We can visualize the marathon times by year, especially noting the marathon times improving significantly since 1970.
3 - US counties
# Setup
# This filters out the N/A values to remove any warning messages. The filtered
# result is assigned back to `county` (previously it was computed but thrown
# away), and the data set is referenced through its namespace so this step
# does not depend on openintro being attached.
county <- filter(
  openintro::county,
  !is.na(homeownership),
  !is.na(poverty)
)
# Part A: The following code attempts to create a boxplot with categorical data
# of smoking ban in the x-axis and numerical pop in the y-axis. It also tries
# to layer this with a scatterplot of numerical median income data, and
# categorical median education data. This code will output a plot but it does
# not really work correctly nor does it make sense. First off, geom_point will
# create a scatterplot, which is expecting two continuous variables.
# median_edu is not a continuous variable, so this will cause issues. I think
# the boxplot is fine, but combining it with a scatterplot does not make sense
# and will lead to a confusing graphic.

# ggplot2 raises its warnings while the plot is drawn, not while it is built,
# so the plot has to be printed inside suppressWarnings() for the suppression
# to have any effect.
suppressWarnings(
  print(
    ggplot(county) +
      geom_point(aes(x = median_edu, y = median_hh_income)) +
      geom_boxplot(aes(x = smoking_ban, y = pop2017))
  )
)
# Part B: The second plot (the one where the median_edu columns are stacked
# vertically) makes it much easier to compare. The spacing is better which
# makes it easier to read. It's also easier to read the charts up from down
# rather than side to side. This means the placement of the faceting variable
# is dependent on what we want to compare. In this case, we want to see the
# poverty levels across people from different median education levels, so
# placing the median education levels in vertical columns makes the most sense.

# Print inside suppressWarnings(): ggplot2 warnings are emitted at draw time,
# so wrapping only the unprinted plot object would not silence them.
suppressWarnings(
  print(
    ggplot(county %>% filter(!is.na(median_edu))) +
      geom_point(aes(x = homeownership, y = poverty)) +
      facet_grid(median_edu ~ .)
  )
)
# Plot B: scatterplot of poverty vs. homeownership with a single quadratic
# (degree-2 polynomial) linear-model fit drawn in blue, no confidence band.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(size = 2) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "blue",
    se = FALSE
  ) +
  theme_gray() +
  labs(
    title = "Plot B",
    x = "homeownership",
    y = "poverty"
  )
# Plot C: points with one green smoother per metro group, drawn on top of the
# points.
# `group` is the ggplot2 aesthetic for splitting a layer into groups; the
# previous `line = metro` is not a real aesthetic and only grouped the
# smoother as a side effect.
ggplot(county, aes(x = homeownership, y = poverty, group = metro)) +
  geom_point() +
  geom_smooth(se = FALSE, color = "green") +
  labs(
    title = "Plot C",
    x = "homeownership",
    y = "poverty"
  )
# Plot D: one blue smoother per metro group, drawn first so the points sit on
# top of the lines.
# `group = metro` replaces the non-existent `line` aesthetic, which only
# grouped the smoother as a side effect.
ggplot(county, aes(x = homeownership, y = poverty, group = metro)) +
  geom_smooth(se = FALSE, color = "blue") +
  geom_point() +
  labs(
    title = "Plot D",
    x = "homeownership",
    y = "poverty"
  )
# Plot E: points colored by metro, with one smoother per metro group
# distinguished by line type (solid for "no", dashed for "yes").
# The layer-level color/linetype mappings already split the data by metro, so
# the invalid plot-level `line = metro` mapping has been dropped.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(aes(linetype = metro), se = FALSE) +
  scale_linetype_manual(values = c("no" = "solid", "yes" = "dashed")) +
  theme_gray() +
  labs(
    title = "Plot E",
    x = "homeownership",
    y = "poverty"
  )
# Plot F: points and smoothers both colored by metro.
# Mapping color to metro in each layer already groups by metro, so the invalid
# plot-level `line = metro` mapping has been dropped.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(aes(color = metro), se = FALSE) +
  theme_gray() +
  labs(
    title = "Plot F",
    x = "homeownership",
    y = "poverty"
  )
# Plot G: points colored by metro with a single blue quadratic
# (degree-2 polynomial) linear-model fit across all counties.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "blue",
    se = FALSE
  ) +
  theme_gray() +
  labs(
    title = "Plot G",
    x = "homeownership",
    y = "poverty"
  )
credit_data <- read_csv(here("data", "credit.csv"))

# Glimpse of Data
# glimpse(credit_data)

# Part A: There appears to be a relationship between credit card balance and
# income. Generally, higher income means credit card balance will also be
# higher. This relationship varies a little depending on the student/married
# factors. Let's start with students. Students tend to have a higher credit
# card balance than non-students, so that shifts the starting point of the
# slope. Students also appear to have a slightly flatter slope. Non-married
# has a little steeper slope compared to married, but married income caps out
# higher.

# Note: I used this website to figure out how to label the axis:
# https://datavizpyr.com/dollar-format-for-axis-labels-with-ggplot2/
ggplot(
  credit_data,
  aes(x = income, y = balance, color = student, shape = student)
) +
  geom_point(size = 2, alpha = .5) +
  geom_smooth(method = "lm", aes(color = student), se = FALSE) +
  theme_gray() +
  theme(legend.position = "none") +
  facet_grid(
    rows = vars(student),
    cols = vars(married),
    # Spelled out in full; `label =` only worked through partial argument
    # matching.
    labeller = labeller(
      student = c("No" = "student: No", "Yes" = "student: Yes"),
      married = c("No" = "married: No", "Yes" = "married: Yes")
    )
  ) +
  labs(
    x = "Income",
    y = "Credit Card Balance"
  ) +
  # scale = 1000 multiplies the income values by 1000 before formatting.
  scale_x_continuous(labels = scales::dollar_format(scale = 1000)) +
  scale_y_continuous(labels = scales::dollar_format())
# Part B: I think that these factors are useful indicators. For example, I
# would expect a student to have a higher credit card balance compared to a
# non-student. I would also expect married income to cap out much higher
# compared to non-married incomes.

# Part C
# First, make the new column and calculate the variable
credit_data$credit_utilization <- credit_data$balance / credit_data$limit

# Same layout as the Part A plot, with credit utilization on the y-axis.
ggplot(
  credit_data,
  aes(x = income, y = credit_utilization, color = student, shape = student)
) +
  geom_point(size = 2, alpha = .5) +
  geom_smooth(method = "lm", aes(color = student), se = FALSE) +
  theme_gray() +
  theme(legend.position = "none") +
  facet_grid(
    rows = vars(student),
    cols = vars(married),
    # Spelled out in full; `label =` only worked through partial argument
    # matching.
    labeller = labeller(
      student = c("No" = "student: No", "Yes" = "student: Yes"),
      married = c("No" = "married: No", "Yes" = "married: Yes")
    )
  ) +
  labs(
    x = "Income",
    y = "Credit Utilization"
  ) +
  # scale = 1000 multiplies the income values by 1000 before formatting.
  scale_x_continuous(labels = scales::dollar_format(scale = 1000)) +
  scale_y_continuous(labels = scales::percent_format())
#Part D: This new plot implies that credit utilization is affected by marital status and student status, not simply just income. Students tend to use their credit more, even with higher levels of income. Non-students tend to utilize less credit, but slightly increase the use with higher income. Married credit use seems to be a lot more flat across the board compared to non-married credit, which has more variance but is also dependent on student vs non-student status.